{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import json\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/qa/squad/ms-train-2.0.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/qa/squad/ms-train-1.1.json\n",
    "# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json\n",
    "# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json\n",
    "# !wget https://adversarialqa.github.io/data/aqa_v1.0.zip\n",
    "# !unzip aqa_v1.0.zip\n",
    "# !rm aqa_v1.0.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json\n",
    "# !wget https://f000.backblazeb2.com/file/malay-dataset/qa/squad/ms-dev-2.0.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://dl.fbaipublicfiles.com/MLQA/MLQA_V1.zip\n",
    "# !unzip MLQA_V1.zip\n",
    "# !rm MLQA_V1.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MLQA_V1/test/test-context-en-question-en.json\n",
      "MLQA_V1/dev/dev-context-en-question-en.json\n",
      "xquad.en.json\n",
      "combined/train.json\n",
      "combined/dev.json\n",
      "train-v1.1.json\n",
      "train-v2.0.json\n",
      "ms-train-1.1.json\n",
      "ms-train-2.0.json\n"
     ]
    }
   ],
   "source": [
    "# Build the training set in JSON Lines form: one 'translation' record per question.\n",
    "# src is '<context> soalan: <question>', tgt is the first reference answer, and\n",
    "# questions without an answer get the placeholder 'tiada jawapan'.\n",
    "with open('train.json', 'w', encoding='utf-8') as fopen_jsonl:\n",
    "\n",
    "    files = ['MLQA_V1/test/test-context-en-question-en.json',\n",
    "             'MLQA_V1/dev/dev-context-en-question-en.json',\n",
    "             'xquad.en.json',\n",
    "             'combined/train.json',\n",
    "             'combined/dev.json',\n",
    "             'train-v1.1.json',\n",
    "             'train-v2.0.json',\n",
    "             'ms-train-1.1.json',\n",
    "             'ms-train-2.0.json']\n",
    "\n",
    "    for f in files:\n",
    "        print(f)\n",
    "        # JSON interchange files are UTF-8; do not depend on the locale's default encoding.\n",
    "        with open(f, encoding='utf-8') as fopen:\n",
    "            data = json.load(fopen)\n",
    "\n",
    "        for article in data['data']:\n",
    "            for p in article['paragraphs']:\n",
    "                text = p['context']\n",
    "                # Skip contexts longer than 600 whitespace-separated tokens.\n",
    "                if len(text.split()) > 600:\n",
    "                    continue\n",
    "                for q in p['qas']:\n",
    "                    qs = q['question']\n",
    "                    answers = q.get('answers') or []\n",
    "                    # Flagged-unanswerable questions, a missing or empty answer list\n",
    "                    # (previously an IndexError / KeyError) and an empty answer string\n",
    "                    # all fall back to the placeholder.\n",
    "                    if q.get('is_impossible', False) or not answers:\n",
    "                        a = 'tiada jawapan'\n",
    "                    else:\n",
    "                        a = answers[0]['text'] or 'tiada jawapan'\n",
    "\n",
    "                    src = f'{text} soalan: {qs}'\n",
    "                    d = {'translation': {'src': src, 'tgt': a, 'prefix': 'konteks: '}}\n",
    "                    fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dev-v2.0.json\n",
      "ms-dev-2.0.json\n"
     ]
    }
   ],
   "source": [
    "# Build the evaluation set in JSON Lines form: one 'translation' record per question.\n",
    "# src is '<context> soalan: <question>', tgt is the first reference answer, and\n",
    "# questions without an answer get the placeholder 'tiada jawapan'.\n",
    "with open('test.json', 'w', encoding='utf-8') as fopen_jsonl:\n",
    "\n",
    "    files = ['dev-v2.0.json',\n",
    "             'ms-dev-2.0.json']\n",
    "\n",
    "    for f in files:\n",
    "        print(f)\n",
    "        # JSON interchange files are UTF-8; do not depend on the locale's default encoding.\n",
    "        with open(f, encoding='utf-8') as fopen:\n",
    "            data = json.load(fopen)\n",
    "\n",
    "        for article in data['data']:\n",
    "            for p in article['paragraphs']:\n",
    "                text = p['context']\n",
    "                # Skip contexts longer than 600 whitespace-separated tokens.\n",
    "                if len(text.split()) > 600:\n",
    "                    continue\n",
    "                for q in p['qas']:\n",
    "                    qs = q['question']\n",
    "                    answers = q.get('answers') or []\n",
    "                    # Flagged-unanswerable questions, a missing or empty answer list\n",
    "                    # (previously an IndexError / KeyError) and an empty answer string\n",
    "                    # all fall back to the placeholder.\n",
    "                    if q.get('is_impossible', False) or not answers:\n",
    "                        a = 'tiada jawapan'\n",
    "                    else:\n",
    "                        a = answers[0]['text'] or 'tiada jawapan'\n",
    "\n",
    "                    src = f'{text} soalan: {qs}'\n",
    "                    d = {'translation': {'src': src, 'tgt': a, 'prefix': 'konteks: '}}\n",
    "                    fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle the training examples with a fixed seed so that a re-run reproduces the\n",
    "# same order (`shuf` picks a new permutation on every run). The whole file is held\n",
    "# in memory while shuffling.\n",
    "with open('train.json', 'rb') as fopen:\n",
    "    lines = fopen.readlines()\n",
    "# Terminate a dangling last line so it cannot fuse with its successor after shuffling.\n",
    "if lines and not lines[-1].endswith(b'\\n'):\n",
    "    lines[-1] += b'\\n'\n",
    "random.Random(42).shuffle(lines)\n",
    "with open('shuffled-train.json', 'wb') as fopen:\n",
    "    fopen.writelines(lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle the evaluation examples with a fixed seed so that a re-run reproduces the\n",
    "# same order (`shuf` picks a new permutation on every run). The whole file is held\n",
    "# in memory while shuffling.\n",
    "with open('test.json', 'rb') as fopen:\n",
    "    lines = fopen.readlines()\n",
    "# Terminate a dangling last line so it cannot fuse with its successor after shuffling.\n",
    "if lines and not lines[-1].endswith(b'\\n'):\n",
    "    lines[-1] += b'\\n'\n",
    "random.Random(42).shuffle(lines)\n",
    "with open('shuffled-test.json', 'wb') as fopen:\n",
    "    fopen.writelines(lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the first 4000 lines of the shuffled evaluation set as a smaller test file.\n",
    "# Done in Python so a missing input raises instead of silently leaving an empty\n",
    "# output; binary mode copies the selected lines byte for byte.\n",
    "with open('shuffled-test.json', 'rb') as fopen, open('test-4k.json', 'wb') as fopen_4k:\n",
    "    for line_no, line in enumerate(fopen):\n",
    "        if line_no >= 4000:\n",
    "            break\n",
    "        fopen_4k.write(line)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
